This project analyzes the General Services Administration's Inventory of Owned and Leased Properties (IOLP), a public dataset covering federal properties across the United States, Puerto Rico, Guam, and American Samoa.
My goal is to understand the composition of GSA's real estate portfolio, identify lease expiration risk, and surface the properties that require the most urgent attention from a portfolio management perspective.
Data source: data.gov: Inventory of Owned and Leased Properties
This cell is commented out after the first run to avoid re-installing on every execution.
#!pip install --upgrade pandas numpy matplotlib plotly "numpy<2.0"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import plotly.io as pio
pio.renderers.default = "notebook"
from pathlib import Path
import os
from datetime import datetime
DATA_DIR = Path(os.getenv('DATA_DIR', 'data/raw'))
OUTPUT_DIR = Path(os.getenv('OUTPUT_DIR', 'data/processed'))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
BUILDINGS_FILE = os.getenv('BUILDINGS_FILE', 'iolp-buildings.xlsx')
LEASES_FILE = os.getenv('LEASES_FILE', 'iolp-leases.xlsx')
# 'As of' date used for days to expiration
AS_OF_DATE_STR = os.getenv('AS_OF_DATE', '')
AS_OF_DATE = pd.to_datetime(AS_OF_DATE_STR).normalize() if AS_OF_DATE_STR else pd.Timestamp('today').normalize()
def resolve_input_path(filename: str) -> Path:
"""Resolve input file path with a few sensible fallbacks."""
p = DATA_DIR / filename
if p.exists():
return p
p2 = Path(filename)
if p2.exists():
return p2
p3 = Path('/mnt/data') / filename
if p3.exists():
return p3
raise FileNotFoundError(f"Could not find {filename}. Checked: {DATA_DIR/filename}, ./{filename}, /mnt/data/{filename}")
BUILDINGS_PATH = resolve_input_path(BUILDINGS_FILE)
LEASES_PATH = resolve_input_path(LEASES_FILE)
print('Using inputs:')
print(BUILDINGS_PATH)
print(LEASES_PATH)
print('As of date:', AS_OF_DATE.date())
Using inputs: data/raw/iolp-buildings.xlsx data/raw/iolp-leases.xlsx As of date: 2026-02-24
# Load the raw data
buildings_df = pd.read_excel(BUILDINGS_PATH)
leases_df = pd.read_excel(LEASES_PATH)
print(f"Buildings loaded: {len(buildings_df)} rows, {len(buildings_df.columns)} columns")
print(f"Leases loaded: {len(leases_df)} rows, {len(leases_df.columns)} columns")
print("\n -> Buildings dataset columns:")
print(buildings_df.columns.tolist())
print("\n -> Leases dataset columns:")
print(leases_df.columns.tolist())
Buildings loaded: 8504 rows, 18 columns Leases loaded: 7381 rows, 19 columns -> Buildings dataset columns: ['Location Code', 'Real Property Asset Name', 'Installation Name', 'Owned or Leased', 'GSA Region', 'Street Address', 'City', 'State', 'Zip Code', 'Latitude', 'Longitude', 'Building Rentable Square Feet', 'Available Square Feet', 'Construction Date', 'Congressional District', 'Congressional District Representative Name', 'Building Status', 'Real Property Asset Type'] -> Leases dataset columns: ['Location Code', 'Real Property Asset Name', 'Installation Name', 'Federal Leased Code', 'GSA Region', 'Street Address', 'City', 'State', 'Zip Code', 'Latitude', 'Longitude', 'Building Rentable Square Feet', 'Available Square Feet', 'Congressional District', 'Congressional District Representative', 'Lease Number', 'Lease Effective Date', 'Lease Expiration Date', 'Real Property Asset type']
# Data quality before cleaning
print("Buildings dataset info:")
print(f" -> Shape: {buildings_df.shape}")
print(f" -> Missing values per column:")
print(buildings_df.isnull().sum()[buildings_df.isnull().sum() > 0])
print(f"\nLeases dataset info:")
print(f" -> Shape: {leases_df.shape}")
print(f" -> Missing values per column:")
print(leases_df.isnull().sum()[leases_df.isnull().sum() > 0])
Buildings dataset info: -> Shape: (8504, 18) -> Missing values per column: Installation Name 7287 Available Square Feet 8504 Construction Date 6783 dtype: int64 Leases dataset info: -> Shape: (7381, 19) -> Missing values per column: Installation Name 7332 Available Square Feet 7381 Congressional District Representative 13 dtype: int64
# Make a copy to preserve original
buildings_clean = buildings_df.copy()
# Standardize column names
buildings_clean.columns = buildings_clean.columns.str.strip()
print("Column names standardized! \n")
# Clean numeric columns
numeric_cols = ['Building Rentable Square Feet', 'Available Square Feet',
'Latitude', 'Longitude', 'Construction Date']
for col in numeric_cols:
if col in buildings_clean.columns:
before_count = buildings_clean[col].count()
buildings_clean[col] = pd.to_numeric(buildings_clean[col], errors='coerce')
after_count = buildings_clean[col].count()
if before_count != after_count:
print(f"{col}: {before_count - after_count} values converted to NaN")
# Clean categorical data
if 'Owned or Leased' in buildings_clean.columns:
buildings_clean['Owned or Leased'] = buildings_clean['Owned or Leased'].str.upper()
ownership_counts = buildings_clean['Owned or Leased'].value_counts()
print(f"Ownership breakdown: {ownership_counts.to_dict()}")
# Create ownership type mapping
buildings_clean['Ownership_Type'] = buildings_clean['Owned or Leased'].map({
'F': 'Federal_Owned',
'L': 'Leased'
})
# Clean state data
if 'State' in buildings_clean.columns:
buildings_clean['State'] = buildings_clean['State'].str.upper().str.strip()
print(f"States cleaned: {buildings_clean['State'].nunique()} unique states" "\n" )
print("Buildings data cleaning completed!")
Column names standardized!
Ownership breakdown: {'L': 6748, 'F': 1756}
States cleaned: 56 unique states
Buildings data cleaning completed!
Lease_Term_Years : total contracted duration of each leaseDays_Until_Expiration : days remaining from the as-of date (negative = already expired)Expiration_Category : a bucketed urgency label (Expired / Critical_1yr / Warning_2yr / Plan_5yr / Long_term)# Clean leases data
print("CLEANING LEASES DATA: \n")
# Make a copy
leases_clean = leases_df.copy()
# Standardize column names
leases_clean.columns = leases_clean.columns.str.strip()
print(" Column names standardized! \n")
# Convert date columns
date_cols = ['Lease Effective Date', 'Lease Expiration Date']
for col in date_cols:
if col in leases_clean.columns:
before_count = leases_clean[col].count()
leases_clean[col] = pd.to_datetime(leases_clean[col], errors='coerce')
after_count = leases_clean[col].count()
print(f"{col}: {after_count} valid dates ({before_count - after_count} invalid)")
# Calculate lease metrics
if all(col in leases_clean.columns for col in date_cols):
# Lease term in years
leases_clean['Lease_Term_Years'] = (
(leases_clean['Lease Expiration Date'] -
leases_clean['Lease Effective Date']).dt.days / 365.25
).round(1)
# Days until expiration
from datetime import datetime
current_date = datetime.now()
leases_clean['Days_Until_Expiration'] = (
leases_clean['Lease Expiration Date'] - current_date
).dt.days
# Expiration categories
leases_clean['Expiration_Category'] = pd.cut(
leases_clean['Days_Until_Expiration'],
bins=[-np.inf, 0, 365, 730, 1825, np.inf],
labels=['Expired', 'Critical_1yr', 'Warning_2yr', 'Plan_5yr', 'Long_term']
)
print("\nLease metrics calculated:")
print(f" Average lease term: {leases_clean['Lease_Term_Years'].mean():.1f} years")
print(f" Expiration categories:")
print(leases_clean['Expiration_Category'].value_counts().to_dict())
print("\n Leases data cleaning completed!")
CLEANING LEASES DATA:
Column names standardized!
Lease Effective Date: 7381 valid dates (0 invalid)
Lease Expiration Date: 7381 valid dates (0 invalid)
Lease metrics calculated:
Average lease term: 14.1 years
Expiration categories:
{'Long_term': 3577, 'Plan_5yr': 1907, 'Critical_1yr': 970, 'Warning_2yr': 802, 'Expired': 125}
Leases data cleaning completed!
Joining the buildings and leases datasets on Location Code, keeping only the latest expiration date per location before merging to avoid duplicating row counts.
# Merge datasets
lease_cols = ['Location Code', 'Lease Number', 'Lease Effective Date',
'Lease Expiration Date', 'Lease_Term_Years',
'Days_Until_Expiration', 'Expiration_Category']
# Only include columns that exist
available_lease_cols = [col for col in lease_cols if col in leases_clean.columns]
print(f"Merging columns: {available_lease_cols}")
# Join check
if 'Location Code' in leases_clean.columns:
dup_counts = leases_clean['Location Code'].value_counts()
n_dupe_keys = int((dup_counts > 1).sum())
if n_dupe_keys > 0:
print(f"Note: {n_dupe_keys:,} Location Codes have multiple lease records. "
"Aggregating to ONE lease row per Location Code (latest expiration).")
# Keep latest expiration per Location Code
leases_for_merge = leases_clean[available_lease_cols].copy()
# Add record count per location
leases_for_merge['Lease_Record_Count'] = leases_for_merge.groupby('Location Code')['Location Code'].transform('size')
sort_cols = [c for c in ['Lease Expiration Date', 'Lease Effective Date'] if c in leases_for_merge.columns]
if sort_cols:
leases_for_merge = leases_for_merge.sort_values(sort_cols, ascending=[False]*len(sort_cols))
leases_one = leases_for_merge.drop_duplicates(subset=['Location Code'], keep='first')
else:
leases_one = leases_clean[available_lease_cols].copy()
master_df = buildings_clean.merge(
leases_one,
on='Location Code',
how='left'
)
print("Master dataset created: ")
print(f" -> Total records: {len(master_df):,}")
print(f" -> Total columns: {len(master_df.columns):,}")
if 'Lease Number' in master_df.columns:
print(f" -> Properties with lease data: {master_df['Lease Number'].notna().sum():,}")
Merging columns: ['Location Code', 'Lease Number', 'Lease Effective Date', 'Lease Expiration Date', 'Lease_Term_Years', 'Days_Until_Expiration', 'Expiration_Category'] Note: 696 Location Codes have multiple lease records. Aggregating to ONE lease row per Location Code (latest expiration). Master dataset created: -> Total records: 8,504 -> Total columns: 26 -> Properties with lease data: 6,257
Lease_Risk_Score (0 to 100) for every leased property, weighted across three dimensions:
| Component | Weight | Logic |
|---|---|---|
| Expiration Proximity | 60% | Leases expiring sooner score higher |
| Size Exposure | 30% | Larger properties score higher (percentile-ranked) |
| Utilization Pressure | 10% | Fully occupied buildings score higher |
Three tiers: Low (< 50), Medium (50–80), High (> 80).
# Lease renewal risk scoring — composite score (0 to 100) across three weighted components
HORIZON_DAYS = int(os.getenv('RISK_HORIZON_DAYS', str(5*365))) # Planning horizon (default: 5 years)
W_EXPIRY = float(os.getenv('W_EXPIRY', '0.60')) # 60% weight: how soon does it expire?
W_SIZE = float(os.getenv('W_SIZE', '0.30')) # 30% weight: how large is the property?
W_UTIL = float(os.getenv('W_UTIL', '0.10')) # 10% weight: how occupied is it?
master_df['Lease_Risk_Score'] = np.nan
master_df['Lease_Risk_Tier'] = np.nan
if {'Ownership_Type','Days_Until_Expiration','Building Rentable Square Feet'}.issubset(master_df.columns):
leased_mask = master_df['Ownership_Type'].eq('Leased') & master_df['Days_Until_Expiration'].notna()
leased = master_df.loc[leased_mask].copy()
# Component 1: Expiration proximity, leases expiring sooner get a higher score
days = leased['Days_Until_Expiration'].clip(lower=0)
expiry_score = (1 - (days / HORIZON_DAYS)).clip(lower=0, upper=1)
# Component 2: Size exposure, larger properties carry more operational risk; percentile-ranked
sqft = pd.to_numeric(leased['Building Rentable Square Feet'], errors='coerce').fillna(0)
size_score = sqft.rank(pct=True).fillna(0)
# Component 3: Utilization pressure, fully occupied buildings leave less flexibility
# Utilization defaults to neutral (0.5)
if {'Available Square Feet','Building Rentable Square Feet'}.issubset(leased.columns):
avail = pd.to_numeric(leased['Available Square Feet'], errors='coerce')
rent = pd.to_numeric(leased['Building Rentable Square Feet'], errors='coerce')
vacancy_pct = (avail / rent).replace([np.inf, -np.inf], np.nan).fillna(0).clip(0, 1)
util_score = (1 - vacancy_pct).clip(0, 1)
else:
util_score = pd.Series(0.5, index=leased.index) # Neutral fallback if vacancy data not available
# Composite score
score = 100 * (W_EXPIRY*expiry_score + W_SIZE*size_score + W_UTIL*util_score)
leased['Lease_Risk_Score'] = score.round(1)
# Assign risk tiers: Low < 50, Medium 50-80, High > 80
leased['Lease_Risk_Tier'] = pd.cut(
leased['Lease_Risk_Score'],
bins=[-np.inf, 50, 80, np.inf],
labels=['Low','Medium','High']
)
master_df.loc[leased.index, 'Lease_Risk_Score'] = leased['Lease_Risk_Score']
master_df.loc[leased.index, 'Lease_Risk_Tier'] = leased['Lease_Risk_Tier'].astype(str)
print('Lease risk score assigned to all leased properties:')
print(master_df.loc[leased.index, 'Lease_Risk_Tier'].value_counts(dropna=False))
else:
print('Skipped risk scoring: required columns not present.')
Lease risk score assigned to all leased properties: Lease_Risk_Tier Low 4246 Medium 1592 High 419 Name: count, dtype: int64
Exporting the key datasets to the processed folder
# Save cleaned datasets
buildings_clean.to_excel(OUTPUT_DIR / 'buildings_cleaned.xlsx', index=False)
leases_clean.to_excel(OUTPUT_DIR / 'leases_cleaned.xlsx', index=False)
master_df.to_excel(OUTPUT_DIR / 'master_dataset.xlsx', index=False)
# Generate portfolio summary
print("*** PORTFOLIO SUMMARY ***\n")
# Basic statistics
total_properties = len(master_df)
total_sqft = master_df['Building Rentable Square Feet'].sum()
print(f" -> PORTFOLIO OVERVIEW:")
print(f" Total Properties: {total_properties:,}")
print(f" Total Rentable Sq Ft: {total_sqft:,.0f}")
print(f" Average Property Size: {total_sqft/total_properties:,.0f} sq ft")
# Ownership breakdown
if 'Ownership_Type' in master_df.columns:
ownership_summary = master_df['Ownership_Type'].value_counts()
print(f"\n -> OWNERSHIP BREAKDOWN:")
for ownership, count in ownership_summary.items():
pct = (count / total_properties) * 100
sqft = master_df[master_df['Ownership_Type'] == ownership]['Building Rentable Square Feet'].sum()
print(f" {ownership}: {count:,} ({pct:.1f}%) ~ {sqft:,.0f} sq ft")
# Top states
if 'State' in master_df.columns:
top_states = master_df['State'].value_counts().head(10)
print(f"\n -> TOP 10 STATES BY PROPERTY COUNT:")
for state, count in top_states.items():
pct = (count / total_properties) * 100
print(f" {state}: {count:,} ({pct:.1f}%)")
# GSA regions
if 'GSA Region' in master_df.columns:
region_summary = master_df['GSA Region'].value_counts().sort_index()
print(f"\n -> GSA REGION DISTRIBUTION:")
for region, count in region_summary.items():
pct = (count / total_properties) * 100
print(f" Region {region}: {count:,} ({pct:.1f}%)")
*** PORTFOLIO SUMMARY *** -> PORTFOLIO OVERVIEW: Total Properties: 8,504 Total Rentable Sq Ft: 359,406,841 Average Property Size: 42,263 sq ft -> OWNERSHIP BREAKDOWN: Leased: 6,748 (79.4%) ~ 173,509,102 sq ft Federal_Owned: 1,756 (20.6%) ~ 185,897,739 sq ft -> TOP 10 STATES BY PROPERTY COUNT: TX: 901 (10.6%) CA: 706 (8.3%) FL: 385 (4.5%) NY: 352 (4.1%) VA: 335 (3.9%) MD: 308 (3.6%) DC: 275 (3.2%) CO: 237 (2.8%) PA: 234 (2.8%) IL: 225 (2.6%) -> GSA REGION DISTRIBUTION: Region 1: 391 (4.6%) Region 2: 517 (6.1%) Region 3: 761 (8.9%) Region 4: 1,365 (16.1%) Region 5: 969 (11.4%) Region 6: 385 (4.5%) Region 7: 1,380 (16.2%) Region 8: 653 (7.7%) Region 9: 1,024 (12.0%) Region 10: 474 (5.6%) Region 11: 585 (6.9%)
print("Executive Summary Analysis: \n")
# Calculate key metrics from master_df
total_properties = len(master_df)
total_sqft = master_df['Building Rentable Square Feet'].sum()
owned_properties = len(master_df[master_df['Ownership_Type'] == 'Federal_Owned'])
leased_properties = len(master_df[master_df['Ownership_Type'] == 'Leased'])
# Leases expiring analysis
leases_expiring_1yr = len(master_df[
(master_df['Days_Until_Expiration'] > 0) &
(master_df['Days_Until_Expiration'] <= 365)
])
leases_expiring_2yr = len(master_df[
(master_df['Days_Until_Expiration'] > 0) &
(master_df['Days_Until_Expiration'] <= 730)
])
# Display key metrics
print(f" -> Total Properties: {total_properties:,}" "\n")
print(f" -> Total Square Feet: {total_sqft:,.0f}" "\n")
print(f" -> Federal Owned: {owned_properties:,} ({owned_properties/total_properties*100:.1f}%)" "\n")
print(f" -> Leased: {leased_properties:,} ({leased_properties/total_properties*100:.1f}%)" "\n")
print(f" -> Leases Expiring (1 year): {leases_expiring_1yr:,}" "\n")
print(f" -> Leases Expiring (2 years): {leases_expiring_2yr:,}" "\n")
print(f" -> Average Property Size: {total_sqft/total_properties:,.0f} sq ft" "\n")
print(f" -> States with Properties: {master_df['State'].nunique()}" "\n")
print(f" -> GSA Regions: {master_df['GSA Region'].nunique()}" "\n")
Executive Summary Analysis: -> Total Properties: 8,504 -> Total Square Feet: 359,406,841 -> Federal Owned: 1,756 (20.6%) -> Leased: 6,748 (79.4%) -> Leases Expiring (1 year): 778 -> Leases Expiring (2 years): 1,399 -> Average Property Size: 42,263 sq ft -> States with Properties: 56 -> GSA Regions: 11
The 'lease cliff' refers to the volume of lease expirations concentrated in a short window, looking at all active leases expiring within the next 5 years and break them down year by year to show where the pressure is greatest.
Using the risk tier from Section 5 to identify the properties that are expiring soon + large enough to create a real operational problem if renewal is delayed. The High tier list replaces the previous manual size+time filter, to prioritize which leases need attention first.
# Lease cliff analysis: year by year expiration volume and risk-prioritized critical list
print('*** Lease Cliff Analysis ***\n')
if 'Lease Expiration Date' in master_df.columns:
# Active leases expiring within the next 5 years
active_leases = master_df[
(master_df['Days_Until_Expiration'] > 0) &
(master_df['Days_Until_Expiration'] <= 1825)
].copy()
if len(active_leases) > 0:
print(f' -> Active leases expiring in next 5 years: {len(active_leases)}')
# Annual breakdown: properties and square footage at risk by year
active_leases['Expiration_Year'] = active_leases['Lease Expiration Date'].dt.year
annual_summary = active_leases.groupby('Expiration_Year').agg({
'Location Code': 'count',
'Building Rentable Square Feet': 'sum'
})
annual_summary.columns = ['Properties_Expiring', 'SqFt_at_Risk']
annual_summary['SqFt_at_Risk'] = annual_summary['SqFt_at_Risk'].astype(int)
print('\n -> Lease Expiration by Year (next 5 years)')
print(annual_summary.to_string())
# Critical leases: identified by High risk tier
if 'Lease_Risk_Tier' in active_leases.columns:
critical_leases = active_leases[
active_leases['Lease_Risk_Tier'] == 'High'
].sort_values('Lease_Risk_Score', ascending=False)
else:
# Fallback if risk scoring was skipped
critical_leases = active_leases[
(active_leases['Building Rentable Square Feet'] > 50000) &
(active_leases['Days_Until_Expiration'] <= 730)
].sort_values('Days_Until_Expiration')
print(f'\n -> High Risk Leases (by risk score): {len(critical_leases)}\n')
if len(critical_leases) > 0:
for _, lease in critical_leases.head(5).iterrows():
days = int(lease['Days_Until_Expiration'])
score = lease.get('Lease_Risk_Score', float('nan'))
print(f" {lease['Real Property Asset Name'][:35]:<35} | "
f"{lease['State']:2} | "
f"{lease['Building Rentable Square Feet']:>8,.0f} sq ft | "
f"{days:>4} days | "
f"Risk: {score:.1f}")
else:
print('No active leases found expiring in the next 5 years')
else:
print('Lease expiration data not available')
*** Lease Cliff Analysis ***
-> Active leases expiring in next 5 years: 2954
-> Lease Expiration by Year (next 5 years)
Properties_Expiring SqFt_at_Risk
Expiration_Year
2026 701 12833016
2027 629 12186604
2028 622 15723563
2029 562 13872137
2030 392 9715179
2031 48 1069905
-> High Risk Leases (by risk score): 355
PATRIOTS PLAZA III | DC | 330,000 sq ft | 17 days | Risk: 99.2
ONE AVIATION PLAZA | NY | 267,771 sq ft | 36 days | Risk: 98.5
BELTWAY LAKE PHASE I | TX | 110,492 sq ft | 27 days | Risk: 97.7
211 BUILDING | WI | 75,196 sq ft | 3 days | Risk: 97.6
BELTSVILLE WAREHOUSE | MD | 116,987 sq ft | 34 days | Risk: 97.5
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
# Get top 10 states by property count
top_states = master_df['State'].value_counts().head(10)
# Create horizontal bar chart
fig_states = px.bar(
x=top_states.values,
y=top_states.index,
orientation='h',
title='Top 10 States by Property Count',
labels={'x': 'Number of Properties', 'y': 'State'},
color=top_states.values,
color_continuous_scale= 'Viridis'
)
fig_states.update_layout(height=500)
fig_states.show()
Understanding the square footage concentration and what share of each state's portfolio is federally owned vs. leased. States with a low ownership rate are more exposed to lease renewal risk.
# State-level summary
state_summary = master_df.groupby('State').agg({
'Building Rentable Square Feet': 'sum',
'Location Code': 'count',
'Ownership_Type': lambda x: (x == 'Federal_Owned').sum()
}).reset_index()
state_summary.columns = ['State', 'Total_SqFt', 'Total_Properties', 'Owned_Properties']
state_summary['Leased_Properties'] = state_summary['Total_Properties'] - state_summary['Owned_Properties']
state_summary['Ownership_Rate'] = (state_summary['Owned_Properties'] / state_summary['Total_Properties'] * 100).round(1)
# Show top 10 states by square footage
top_sqft_states = state_summary.nlargest(10, 'Total_SqFt')
print("Top 10 States by Total SqFt: \n")
print(top_sqft_states[['State', 'Total_Properties', 'Total_SqFt', 'Ownership_Rate']].to_string(index=False))
# Create a pie chart of top 10 states
fig_geo_pie = px.pie(
top_sqft_states,
values='Total_SqFt',
names='State',
title='Top 10 States: Portfolio Distribution by SqFt'
)
fig_geo_pie.show()
Top 10 States by Total SqFt: State Total_Properties Total_SqFt Ownership_Rate DC 275 50443937.19 63.6 MD 308 28384271.60 38.6 CA 706 27929742.09 13.6 VA 335 23947782.66 7.5 TX 901 23582821.72 42.3 NY 352 18131287.86 19.0 MO 192 16183878.39 21.9 FL 385 11758548.07 7.0 PA 234 11230283.42 11.1 GA 201 10737372.09 16.9
Plotting all properties with valid coordinates on an interactive map, sized by rentable square footage and colored by ownership type
# Properties for performance
map_data = master_df.dropna(subset=['Latitude', 'Longitude']).sample(
min(9731, len(master_df))
)
print(f"Mapping {len(map_data)} properties")
# Create scatter map
fig_map = px.scatter_mapbox(
map_data,
lat='Latitude',
lon='Longitude',
size='Building Rentable Square Feet',
color='Ownership_Type',
hover_data=['Real Property Asset Name', 'State', 'GSA Region'],
mapbox_style='open-street-map',
title=f'GSA Properties: Geographic Distribution ({len(map_data)} properties)',
height=600,
size_max=15
)
# Center on US
fig_map.update_layout(
mapbox=dict(
center=dict(lat=39.8283, lon=-98.5795),
zoom=3
)
)
fig_map.show()
Mapping 8504 properties
Breaking down owned vs. leased square footage at the GSA region level. Some regions carry a much higher lease dependency than others, which translates directly into exposure when multiple leases expire in the same window.
# Portfolio mix by GSA Region
region_analysis = master_df.groupby(['GSA Region', 'Ownership_Type']).agg({
'Building Rentable Square Feet': 'sum',
'Location Code': 'count'
}).reset_index()
# Create stacked bar chart
fig_portfolio = px.bar(
region_analysis,
x='GSA Region',
y='Building Rentable Square Feet',
color='Ownership_Type',
title='Portfolio Mix by GSA Region: Owned vs Leased Square Footage',
labels={'Building Rentable Square Feet': 'Square Feet'},
color_discrete_map={
'Federal_Owned': 'blue',
'Leased': 'red'
}
)
fig_portfolio.show()
# Property size distribution
property_sizes = master_df[
(master_df['Building Rentable Square Feet'] > 0) &
(master_df['Building Rentable Square Feet'] < 100000)
].copy()
fig_size = px.histogram(
property_sizes,
x='Building Rentable Square Feet',
color='Ownership_Type',
nbins=30,
title='Property Size Distribution: Owned vs Leased (Under 100K sq ft)',
labels={'count': 'Number of Properties'}
)
fig_size.show()
Using the risk tiers to answer three questions:
import plotly.express as px
# Aggregate risk tier counts by GSA region
risk_by_region = (
master_df[master_df['Lease_Risk_Tier'].notna()]
.groupby(['GSA Region', 'Lease_Risk_Tier'])
.size()
.reset_index(name='Count')
)
# Order tiers for consistent legend display
tier_order = ['High', 'Medium', 'Low']
risk_by_region['Lease_Risk_Tier'] = pd.Categorical(
risk_by_region['Lease_Risk_Tier'], categories=tier_order, ordered=True
)
risk_by_region = risk_by_region.sort_values(['GSA Region', 'Lease_Risk_Tier'])
fig_risk_region = px.bar(
risk_by_region,
x='GSA Region',
y='Count',
color='Lease_Risk_Tier',
color_discrete_map={'High': '#d62728', 'Medium': '#ff7f0e', 'Low': '#2ca02c'},
barmode='group',
title='Lease Risk Tier Distribution by GSA Region',
labels={'Count': 'Number of Leased Properties', 'Lease_Risk_Tier': 'Risk Tier'}
)
fig_risk_region.update_layout(xaxis_title='GSA Region', yaxis_title='Number of Properties')
fig_risk_region.show()
This scatter plot visualizes every leased property in two dimensions: how soon it expires (x-axis) and how large it is (y-axis)
# Scatter plot: Days Until Expiration vs. Building Size, colored by risk tier
scatter_data = master_df[
master_df['Lease_Risk_Tier'].notna() &
master_df['Days_Until_Expiration'].notna() &
(master_df['Days_Until_Expiration'] > 0)
].copy()
fig_scatter = px.scatter(
scatter_data,
x='Days_Until_Expiration',
y='Building Rentable Square Feet',
color='Lease_Risk_Tier',
color_discrete_map={'High': '#d62728', 'Medium': '#ff7f0e', 'Low': '#2ca02c'},
hover_data=['Real Property Asset Name', 'State', 'GSA Region', 'Lease_Risk_Score'],
title='Lease Risk Landscape: Expiration Timeline vs. Property Size',
labels={
'Days_Until_Expiration': 'Days Until Expiration',
'Building Rentable Square Feet': 'Rentable Square Feet',
'Lease_Risk_Tier': 'Risk Tier'
},
height=600,
opacity=0.7
)
fig_scatter.update_layout(
xaxis_title='Days Until Expiration',
yaxis_title='Rentable Square Feet'
)
fig_scatter.show()
Ranking all High-tier leased properties by their risk score. This is the output of the risk framework, a prioritized watchlist of properties where the combination of expiration urgency and portfolio exposure is highest, exported this as high_risk_leases.xlsx in the final section.
# Ranked priority table of all High-risk leased properties
high_risk = master_df[
master_df['Lease_Risk_Tier'] == 'High'
].sort_values('Lease_Risk_Score', ascending=False).copy()
display_cols = [
'Real Property Asset Name', 'State', 'GSA Region',
'Building Rentable Square Feet', 'Days_Until_Expiration',
'Lease Expiration Date', 'Lease_Risk_Score'
]
# Only include columns that exist
display_cols = [c for c in display_cols if c in high_risk.columns]
print(f'High-Risk Leased Properties (Top 20 by Risk Score)\n')
print(f'Total High-Risk properties: {len(high_risk)}\n')
top20 = high_risk[display_cols].head(20)
top20['Building Rentable Square Feet'] = top20['Building Rentable Square Feet'].map('{:,.0f}'.format)
top20['Days_Until_Expiration'] = top20['Days_Until_Expiration'].map('{:.0f}'.format)
top20['Lease_Risk_Score'] = top20['Lease_Risk_Score'].map('{:.1f}'.format)
print(top20.to_string(index=False))
High-Risk Leased Properties (Top 20 by Risk Score)
Total High-Risk properties: 419
Real Property Asset Name State GSA Region Building Rentable Square Feet Days_Until_Expiration Lease Expiration Date Lease_Risk_Score
TWO CONSTITUTION SQUARE DC 11 575,790 -25 2026-01-31 99.9
JAMES POLK BUILDING VA 11 353,631 -25 2026-01-31 99.8
PATRIOTS PLAZA III DC 11 330,000 17 2026-03-14 99.2
AUTOMOBILE MAINT FAC DC 11 153,741 -21 2026-02-04 99.1
SAN ANTONIO FIELD OFFICE TX 7 148,584 -25 2026-01-31 99.1
POYDRAS CENTER LA1475 LA 7 127,230 -1 2026-02-24 98.8
ONE AVIATION PLAZA NY 2 267,771 36 2026-04-02 98.5
ATLANTIC CORPORATE PARK WEST TOWER VA 11 109,751 -25 2026-01-31 98.5
1616 NORTH FORT MYER DRIVE VA 11 104,705 -390 2025-01-31 98.5
SEA-LAND BUILDING (ON-AIRPORT) NJ 2 92,553 -25 2026-01-31 98.2
CAPITAL VIEW DC 11 76,200 -10 2026-02-15 97.8
RICK FINLEY BUILDING MI 5 78,272 -56 2025-12-31 97.8
SEATAC NORTH TOWER WA 10 74,797 -66 2025-12-21 97.7
BELTWAY LAKE PHASE I TX 7 110,492 27 2026-03-24 97.7
211 BUILDING WI 5 75,196 3 2026-02-28 97.6
68-94 S. ELMWOOD AVE NY 2 70,796 -10 2026-02-15 97.6
AVION MIDRISE IV VA 11 71,504 -25 2026-01-31 97.6
BELTSVILLE WAREHOUSE MD 11 116,987 34 2026-03-31 97.5
PREMIER TURNPIKE PK FL 4 110,687 46 2026-04-12 97.1
VA95 BUILDING 6 VA 11 79,971 34 2026-03-31 96.7
Exporting the key analytical outputs to the processed data folder. high_risk_leases.xlsx is driven directly by the risk model, sorted by score descending.
try:
# Export year by year lease cliff summary
if 'annual_summary' in locals():
annual_summary.reset_index().to_excel(OUTPUT_DIR / 'lease_cliff_analysis.xlsx', index=False)
print('*** Exported: lease_cliff_analysis.xlsx')
# Export state-level summary
if 'state_summary' in locals():
state_summary.to_excel(OUTPUT_DIR / 'state_summary.xlsx', index=False)
print('*** Exported: state_summary.xlsx')
# Export high risk lease priority list, ranked by composite risk score
if 'high_risk' in locals() and len(high_risk) > 0:
high_risk.sort_values('Lease_Risk_Score', ascending=False).to_excel(
OUTPUT_DIR / 'high_risk_leases.xlsx', index=False
)
print('*** Exported: high_risk_leases.xlsx')
# Export full master dataset with risk scores attached
master_df.to_excel(OUTPUT_DIR / 'master_dataset.xlsx', index=False)
print('*** Exported: master_dataset.xlsx')
print('\nAll files exported successfully!')
except Exception as e:
print(f'!!! Export error: {e} !!!')
*** Exported: lease_cliff_analysis.xlsx *** Exported: state_summary.xlsx *** Exported: high_risk_leases.xlsx *** Exported: master_dataset.xlsx All files exported successfully!